In [1]:
"""
See "bill_text_preproc.py" file

"""


Out[1]:
'\nSee "bill_text_preproc.py" file\n\n'

In [8]:
import pickle
from bs4 import BeautifulSoup as bs
import pandas

Load Bills


In [3]:
bill_texts_file = "data/bill_texts_filed.pkl"

In [4]:
with open(bill_texts_file, 'rb') as f1:
    bill_texts_filed = pickle.load(f1)

In [9]:
bill_texts_filed = pandas.DataFrame(bill_texts_filed)

In [10]:
bill_texts_filed.shape


Out[10]:
(2098, 4)

In [12]:
bill_texts_filed.head()


Out[12]:
bill house session text
0 1 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
1 2 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
2 3 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
3 4 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
4 5 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...

Extract Data


In [13]:
sample = bill_texts_filed.sample(25)

In [26]:
sample.head()


Out[26]:
bill house session text
952 908 H 2015 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
1850 656 S 2015 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
1027 983 H 2015 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
929 885 H 2015 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...
1676 482 S 2015 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con...

Get some Examples


In [16]:
ht = sample.text[952]
st = sample.text[1850]

In [17]:
hs = bs(ht, 'html.parser')
ss = bs(st, 'html.parser')

In [18]:
hs = hs.body
ss = ss.body

In [20]:
hs


Out[20]:
<body lang="EN-US">
<div class="WordSection1">
<p align="center" class="nonumber" style="margin-top:24.0pt;text-align:center"><b>GENERAL
ASSEMBLY OF NORTH CAROLINA</b></p>
<p align="center" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
margin-bottom:6.0pt;margin-left:0in;text-align:center"><b>SESSION 2015</b></p>
<p align="left" class="nonumber" style="text-align:left"><b>H                                                                                                                                                   D</b></p>
<p align="center" class="nonumber" style="text-align:center"><b>HOUSE
DRH10271-LRa-136A  (04/02)</b></p>
<p align="center" class="nonumber" style="text-align:center"><b> </b></p>
<p align="center" class="nonumber" style="text-align:center"><b> </b></p>
<p align="center" class="nonumber" style="text-align:center"><b> </b></p>
<table border="1" cellpadding="0" cellspacing="0" class="MsoNormalTable" style="width:6.55in;border-collapse:collapse;border:none" width="629">
<tr>
<td colspan="2" style="width:408.25pt;border:none;
  border-bottom:solid windowtext 1.0pt;padding:0in .05in 0in 5.4pt" valign="top" width="544">
<p align="left" class="nonumber" style="margin-bottom:6.0pt;text-align:left">Short
  Title:        DEM/Emp. Retention Funds/LRC Study.</p>
</td>
<td style="width:63.35pt;border:none;border-bottom:solid windowtext 1.0pt;
  padding:0in .05in 0in 5.4pt" valign="top" width="84">
<p align="right" class="nonumber" style="margin-bottom:6.0pt;text-align:right">(Public)</p>
</td>
</tr>
<tr>
<td style="width:1.0in;border:none;border-bottom:solid windowtext 1.0pt;
  padding:0in .05in 0in 5.4pt" valign="top" width="96">
<p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
  margin-bottom:3.0pt;margin-left:0in;text-align:left">Sponsors:</p>
</td>
<td colspan="2" style="width:5.55in;border:none;
  border-bottom:solid windowtext 1.0pt;padding:0in .05in 0in 5.4pt" valign="top" width="533">
<p class="nonumber" style="margin-top:3.0pt;margin-right:0in;margin-bottom:
  3.0pt;margin-left:0in">Representative Whitmire.</p>
</td>
</tr>
<tr>
<td style="width:1.0in;border:none;border-bottom:solid windowtext 1.0pt;
  padding:0in .05in 0in 5.4pt" valign="top" width="96">
<p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
  margin-bottom:3.0pt;margin-left:0in;text-align:left">Referred to:</p>
</td>
<td colspan="2" style="width:5.55in;border:none;
  border-bottom:solid windowtext 1.0pt;padding:0in .05in 0in 5.4pt" valign="top" width="533">
<p class="nonumber" style="margin-top:3.0pt;margin-right:0in;margin-bottom:
  3.0pt;margin-left:0in"> </p>
</td>
</tr>
<tr height="0">
<td style="border:none" width="96"></td>
<td style="border:none" width="448"></td>
<td style="border:none" width="84"></td>
</tr>
</table>
<p align="left" class="nonumber" style="margin-top:6.0pt;text-align:left"> </p>
<p align="center" class="aBase" style="text-align:center">A BILL TO BE ENTITLED</p>
<p class="aLongTitle"><span style='font-family:"Times New Roman","serif"'>AN ACT to
appropriate funds to the department of public safety for retention‑based
salary adjustments for division of emergency management employees and
authorizing the legislative research commission to review whether there should
be established an emergency management preparedness and response fee to support
the work of the division.</span></p>
<p class="aBase">The General Assembly of North Carolina enacts:</p>
<p class="aBillSection"><b>SECTION 1.(a)</b>  There is appropriated from the
General Fund to the Department of Public Safety the sum of four hundred six
thousand four hundred fifty‑nine dollars ($406,459) for the 2015‑2016
fiscal year and the sum of four hundred six thousand four hundred fifty‑nine
dollars ($406,459) for the 2016‑2017 fiscal year to provide salary
adjustments for the retention of employees in the Division of Emergency
Management.</p>
<p class="aBillSection"><b>SECTION 1.(b)</b>  This section becomes effective July
1, 2015.</p>
<p class="aBillSection"><b>SECTION 2.</b>  The Legislative Research Commission
shall study whether an emergency management preparedness and response fee
should be assessed on each homeowners', mobile home owners', tenant homeowners',
and condominium unit owners' property insurance policy issued in this State for
the purpose of funding the operations of the Division of Emergency Management
of the Department of Public Safety. The Legislative Research Commission may
submit a report of its findings and recommendations to the 2015 General
Assembly, prior to the convening of the 2016 Regular Session, by filing the
report with the Speaker of the House of Representatives and the President Pro
Tempore of the Senate, with a copy of the report submitted to the respective
Chairs of the House of Representatives and Senate Appropriations Committees on
Justice and Public Safety.</p>
<p class="aBillSection"><b>SECTION 3.</b>  This act is effective when it becomes
law.</p>
</div>
</body>

In [21]:
ss


Out[21]:
<body lang="EN-US">
<div class="WordSection1">
<p align="center" class="nonumber" style="margin-top:24.0pt;text-align:center"><b>GENERAL
ASSEMBLY OF NORTH CAROLINA</b></p>
<p align="center" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
margin-bottom:6.0pt;margin-left:0in;text-align:center"><b>SESSION 2015</b></p>
<p align="left" class="nonumber" style="text-align:left"><b>S                                                                                                                                                    D</b></p>
<p align="center" class="nonumber" style="text-align:center"><b>SENATE
DRS45321-LRf-105  (03/14)</b></p>
<p align="center" class="nonumber" style="text-align:center"><b> </b></p>
<p align="center" class="nonumber" style="text-align:center"><b> </b></p>
<p align="center" class="nonumber" style="text-align:center"><b> </b></p>
<table border="1" cellpadding="0" cellspacing="0" class="MsoNormalTable" style="width:6.55in;border-collapse:collapse;border:none" width="629">
<tr>
<td colspan="2" style="width:408.25pt;border:none;
  border-bottom:solid windowtext 1.0pt;padding:0in .05in 0in 5.4pt" valign="top" width="544">
<p align="left" class="nonumber" style="margin-bottom:6.0pt;text-align:left">Short
  Title:        WC/2015 Omnibus Law Changes.</p>
</td>
<td style="width:63.35pt;border:none;border-bottom:solid windowtext 1.0pt;
  padding:0in .05in 0in 5.4pt" valign="top" width="84">
<p align="right" class="nonumber" style="margin-bottom:6.0pt;text-align:right">(Public)</p>
</td>
</tr>
<tr>
<td style="width:1.0in;border:none;border-bottom:solid windowtext 1.0pt;
  padding:0in .05in 0in 5.4pt" valign="top" width="96">
<p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
  margin-bottom:3.0pt;margin-left:0in;text-align:left">Sponsors:</p>
</td>
<td colspan="2" style="width:5.55in;border:none;
  border-bottom:solid windowtext 1.0pt;padding:0in .05in 0in 5.4pt" valign="top" width="533">
<p class="nonumber" style="margin-top:3.0pt;margin-right:0in;margin-bottom:
  3.0pt;margin-left:0in">Senator Lee (Primary Sponsor).</p>
</td>
</tr>
<tr>
<td style="width:1.0in;border:none;border-bottom:solid windowtext 1.0pt;
  padding:0in .05in 0in 5.4pt" valign="top" width="96">
<p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
  margin-bottom:3.0pt;margin-left:0in;text-align:left">Referred to:</p>
</td>
<td colspan="2" style="width:5.55in;border:none;
  border-bottom:solid windowtext 1.0pt;padding:0in .05in 0in 5.4pt" valign="top" width="533">
<p class="nonumber" style="margin-top:3.0pt;margin-right:0in;margin-bottom:
  3.0pt;margin-left:0in"> </p>
</td>
</tr>
<tr height="0">
<td style="border:none" width="96"></td>
<td style="border:none" width="448"></td>
<td style="border:none" width="84"></td>
</tr>
</table>
<p align="left" class="nonumber" style="margin-top:6.0pt;text-align:left"> </p>
<p align="center" class="aBase" style="text-align:center">A BILL TO BE ENTITLED</p>
<p class="aLongTitle"><span style='font-family:"Times New Roman","serif"'>AN ACT clarifying
the authority and duties of industrial commission fraud investigators and making
technical, conforming, and other changes to the workers' compensation laws of
north carolina.</span></p>
<p class="aBase">The General Assembly of North Carolina enacts:</p>
<p class="aBillSection"><b>SECTION 1.(a)</b>  Article 1 of Chapter 97 of the
General Statutes is amended by adding a new section to read:</p>
<p class="aSection"><span style="font-weight:normal">"</span><u>§ 97‑79.1. 
Authority of Industrial Commission fraud investigators; inspection of records.</u></p>
<p class="aMargin1"><u>(a)</u>        <u>The Commission shall establish a Criminal
Investigation Unit to operate as a law enforcement agency for the enforcement
of this Chapter. Members of the unit shall serve as fraud investigators and
must be sworn law enforcement officers duly appointed and certified by the
North Carolina Criminal Justice Education and Training Standards Commission.</u></p>
<p class="aMargin1"><u>(b)</u>        <u>A fraud investigator employed by the
Commission, who has sworn the oath prescribed for a law enforcement officer, shall
have the following authority:</u></p>
<p class="aBlock1"><u>(1)</u>        <u>To make arrests and take other
investigatory and enforcement actions for both felonies and misdemeanors and to
charge for infractions for violations of the laws of the State, with the
primary responsibility of enforcing the Workers' Compensation Act.</u></p>
<p class="aBlock1"><u>(2)</u>        <u>To act as a State law enforcement officer
with jurisdiction throughout the State.</u></p>
<p class="aBlock1"><u>(3)</u>        <u>To serve and execute orders issued by the
Commission in connection with contempt proceedings. While serving and executing
such an order, a fraud investigator has the same authority and power possessed
by a local law officer or sheriff's deputy when executing an arrest warrant.</u></p>
<p class="aBlock1"><u>(4)</u>        <u>To inspect records of business kept under
G.S. 58‑2‑185 by insurance companies, agents, or brokers doing
any kind of business in this State involving workers' compensation.</u></p>
<p class="aMargin1"><u>(c)</u>        <u>Each insurance company, agent, or broker
keeping records under G.S. 58‑2‑185 shall furnish copies of these
records to the Commission's fraud investigators on demand, and the original
books of records shall be open to the inspection of the Commissioner when
demanded. Any person who refuses, on demand, to exhibit the records of business
as provided by this subsection or who knowingly makes a false statement in
regard to the records when demanded is guilty of a Class 1 misdemeanor.</u>"</p>
<p class="aBillSection"><b>SECTION 1.(b)</b>  G.S. 143‑166.13 is amended
by adding a new subdivision to read:</p>
<p class="aBlock1">"<u>(20)</u>    <u>Sworn State Law‑Enforcement
Officers with the power of arrest, Industrial Commission Fraud Investigators,
Department of Commerce.</u>"</p>
<p class="aBillSection"><b>SECTION 2.</b>  G.S. 97‑88.2(b) reads as
rewritten:</p>
<p class="aMargin1">"(b)      The Commission shall:</p>
<p class="aBlock1">(1)        Perform investigations regarding all cases of
suspected fraud and all violations related to workers' compensation claims, by
or against insurers or self‑funded employers, and refer possible criminal
violations to the <s>appropriate prosecutorial authorities;</s><u>Criminal Investigation
Unit.</u></p>
<p class="aBlock1">(2)        Conduct administrative violation proceedings; and</p>
<p class="aBlock1">(3)        Assess and collect civil penalties and restitution.</p>
<p class="aMargin1">The Commission may employ sworn law enforcement officers <s>duly
appointed and certified through the North Carolina Criminal Justice Education
and Training Standards Commission </s><u>pursuant to G.S. 97‑79.1 </u>to
<u>enforce the laws and </u>conduct the investigations mandated by this <s>subsection.</s><u>section.</u>"</p>
<p class="aBillSection"><b>SECTION 3.</b>  G.S. 97‑73(d) reads as
rewritten:</p>
<p class="aMargin1">"(d)      Safety. – A fee in the amount set by the Industrial
Commission is imposed on an employer for whom the Industrial Commission
provides an educational training program on how to prevent or reduce accidents
or injuries that result in workers' compensation claims or a person for whom
the Industrial Commission provides other educational services. <u>The
Commission may set a reasonable fee imposed for a review of the safety rules. </u>The
fees are departmental receipts."</p>
<p class="aBillSection"><b>SECTION 4.</b>  G.S. 97‑87(c)(5) reads as
rewritten</p>
<p class="aBlock1">"(5)      If any party disputes the decision of the
Commission entered under subdivision (c)(4) of this section, the party may
appeal to the full Commission within 10 days of the entry of the decision of
the Commission. The nonappealing party may file a response within 10 days of
receiving notice of appeal. The notice of appeal shall request one of the
following:</p>
<p class="aBlock2">a.         The Commission reconsider the decision entered
based on the record and any additional evidence that parties submit with the
notice and response.</p>
<p class="aBlock2">b.         A de novo evidentiary hearing before the <s>full </s>Commission."</p>
<p class="aBillSection"><b>SECTION 5.</b>  G.S. 97‑87(c)(7) reads as
rewritten:</p>
<p class="aMargin1">"(c)       When an award or portion of an award provides
for periodic payments to be made on or after the date of the award, a judgment
may be docketed as provided in subsection (d) of this section, in an amount
equal to the sum stated in any Certificate of Accrued Arrearages that is issued
by the Commission under this subsection. If any payment that has accrued after
the date of the award, or after the date specified in the most recent
Certificate of Accrued Arrearages issued under this subsection, is not received
by the claimant when due, the following procedure is available for obtaining a
Certificate of Accrued Arrearages:</p>
<p class="aBlock1">…</p>
<p class="aBlock1">(7)        If a notice of appeal is given under sub‑subdivision
(c)(5)a. of this section, the Commission shall issue its decision within 10
days of the filing of the response under subdivision (c)(5)b. of this section.
If a notice of appeal is given under sub‑subdivision (c)(5) of this
section, the Commission shall either <u>(i) </u>conduct an evidentiary hearing
and issue its decision on the appeal within 90 days of the filing of the <s>response
</s><u>response, or when a response is due if no response is filed, </u>under
subdivision (c)(5) of this section or <u>(ii) </u>deny the request for the
evidentiary hearing and issue its decision within 10 days of the filing of the
response under subdivision (c)(5) of this section. Further appeals are governed
by G.S. 97‑86."</p>
<p class="aBillSection"><b>SECTION 6.</b>  G.S. 97‑92(d) reads as
rewritten:</p>
<p class="aMargin1">"(d)      The said report shall contain the name,
nature, and location of the business of the employer and name, age, sex, <s>and
wages </s><u>wages, if available, </u>and occupation of the injured employee,
and shall state the date and hour of the accident causing injury, the nature
and cause of the injury, and such other information as may be required by the
Commission."</p>
<p class="aBillSection"><b>SECTION 7.</b>  G.S. 97‑101 reads as
rewritten:</p>
<p class="aSection"><span style="font-weight:normal">"</span>§ 97‑101. 
Collection of fines and penalties.</p>
<p class="aMargin1">The Industrial Commission shall have the power by civil
action brought in its own name to enforce the collection of any fines or
penalties provided by this <s>Article, and fines or penalties collected by the
Commission shall become a part of the maintenance fund referred to in
subsection (j) of G.S. 97‑100.</s><u>Article.</u>"</p>
<p class="aBillSection"><b>SECTION 8.</b>  G.S. 97‑26.2 reads as
rewritten:</p>
<p class="aSection"><span style="font-weight:normal">"</span>§ 97‑26.2. 
Reimbursement for prescription <s>drugs </s><u>drugs, prescribed over‑the‑counter
drugs, </u>and professional pharmaceutical services.</p>
<p class="aMargin1">(a)        The reimbursement for prescription <s>drugs </s><u>drugs,
prescribed over‑the‑counter drugs, </u>and professional
pharmaceutical services shall be limited to <u>no greater than </u>ninety‑five
percent (95%) of the average wholesale price (AWP) of the product, calculated
on a per unit basis, as of the date of dispensing.</p>
<p class="aMargin1">(b)        All of the following shall apply to the
reimbursement for prescription drugs and professional pharmaceutical services:</p>
<p class="aBlock1">(1)        A health care provider seeking reimbursement for <s>drugs
dispensed by a physician </s><u>health care provider dispensed prescription
drugs, prescribed over‑the‑counter drugs, and pharmaceutical
services </u>shall include the original manufacturer's National Drug Code (NDC)
number, as assigned by the United States Food and Drug Administration, on <s>the
bills and reports required by this section.</s><u>any billing documents or
invoices issued.</u></p>
<p class="aBlock1">(2)        In no event may a <s>physician </s><u>health care
provider </u>receive reimbursement in excess of ninety‑five percent (95%)
of the AWP of the drugs dispensed by a <s>physician, </s><u>health care
provider, </u>as determined by reference to the original manufacturer's NDC
number.</p>
<p class="aBlock1">(3)        A repackaged NDC number may not be <u>individually </u>used
<u>on any billing documents or invoices issued </u>and will not be considered
the original manufacturer's NDC number. <u>A repackaged NDC number may only
appear in conjunction with the manufacturer's NDC number. </u>If a health care
provider seeking reimbursement for drugs dispensed by a <s>physician </s><u>health
care provider </u>does not include the original manufacturer's NDC number on <s>the
bills and reports required by this section, </s><u>any billing documents or
invoices issued, </u>reimbursement shall be limited to one hundred percent
(100%) of the AWP of the least expensive clinically equivalent drug, calculated
on a per unit basis.</p>
<p class="aBlock1">(4)        No outpatient provider, other than a licensed
pharmacy, may receive reimbursement for a Schedule II controlled substance, as
defined in G.S. 90‑90, or a Schedule III controlled substance, as
defined in G.S. 90‑91, <u>or a Schedule IV controlled substance, as
defined by G.S. 90‑92, </u>dispensed in excess of an initial five‑day
supply, commencing upon the employee's initial treatment following injury. <u>Only
the initial health care provider providing the employee's initial treatment
following injury may seek reimbursement for dispensing controlled substances as
described in this section, and any subsequent dispensing of controlled
substances by another health care provider will be ineligible for reimbursement.
</u>Reimbursement under this subdivision shall be made for the five‑day
supply at the rates provided in this section.</p>
<p class="aBlock1">(5)        For purposes of this section, the term "clinically
equivalent" means a drug has chemical equivalents which, when administered
in the same amounts, will provide essentially the same therapeutic effect as
measured by the control of a symptom or disease."</p>
<p class="aBillSection"><b>SECTION 9.</b>  G.S. 97‑200(a) reads as
rewritten:</p>
<p class="aMargin1">"(a)       A self‑insurer shall not utilize any
claims adjuster unless the adjuster is licensed under <s>G.S. 58‑33‑25.</s><u>G.S. 58‑33‑26.</u>"</p>
<p class="aBillSection"><b>SECTION 10.</b>  This act is effective when it becomes
law.</p>
</div>
</body>

Raw Text Extraction


In [129]:
def cleanRawText(text):
    """
    vtype text: str
    
    rtype text: str / utf8
    """
    text = text.encode('ascii', 'replace')
    text = text.replace(b'?', b'')
    text = text.replace(b'\r\n', b'\n')
    text = text.replace(b'\n', b' ')
    text = text.strip()
    text = text.decode('utf8')
    return(text)

In [162]:
def extractRawText(body):
    """
    vtype body: bs4.element.Tag
    
    rtype raw_text: str
    """
    paras = body.find_all('p')
    texts = [p.text for p in paras]
    texts = [t.strip() for t in texts]
    raw_text = '\n\n'.join(texts)
    raw_text = cleanRawText(raw_text)
    return(raw_text)

In [163]:
hrt = extractRawText(hs)
hrt[:500]


Out[163]:
'GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10271-LRa-136A (04/02)        Short   Title: DEM/Emp. Retention Funds/LRC Study.  (Public)  Sponsors:  Representative Whitmire.  Referred to:      A BILL TO BE ENTITLED  AN ACT to appropriate funds to the department of public safety for retentionbased salary adjustments for division of emergency management employees and authorizing the legislative research commission to review whether there should be established an emergency managem'

In [164]:
srt = extractRawText(ss)
srt[-500:]


Out[164]:
'ided in this section.  (5) For purposes of this section, the term "clinically equivalent" means a drug has chemical equivalents which, when administered in the same amounts, will provide essentially the same therapeutic effect as measured by the control of a symptom or disease."  SECTION 9. G.S.97200(a) reads as rewritten:  "(a) A selfinsurer shall not utilize any claims adjuster unless the adjuster is licensed under G.S.583325.G.S.583326."  SECTION 10. This act is effective when it becomes law.'

In [165]:
text = hrt

In [166]:
text = text.encode('ascii', 'replace')
text[:500]


Out[166]:
b'GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10271-LRa-136A (04/02)        Short   Title: DEM/Emp. Retention Funds/LRC Study.  (Public)  Sponsors:  Representative Whitmire.  Referred to:      A BILL TO BE ENTITLED  AN ACT to appropriate funds to the department of public safety for retentionbased salary adjustments for division of emergency management employees and authorizing the legislative research commission to review whether there should be established an emergency managem'

In [167]:
text = text.replace(b'?', b'')
text[:500]


Out[167]:
b'GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10271-LRa-136A (04/02)        Short   Title: DEM/Emp. Retention Funds/LRC Study.  (Public)  Sponsors:  Representative Whitmire.  Referred to:      A BILL TO BE ENTITLED  AN ACT to appropriate funds to the department of public safety for retentionbased salary adjustments for division of emergency management employees and authorizing the legislative research commission to review whether there should be established an emergency managem'

In [168]:
text = text.replace(b'\r\n', b'\n')
text[:500]


Out[168]:
b'GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10271-LRa-136A (04/02)        Short   Title: DEM/Emp. Retention Funds/LRC Study.  (Public)  Sponsors:  Representative Whitmire.  Referred to:      A BILL TO BE ENTITLED  AN ACT to appropriate funds to the department of public safety for retentionbased salary adjustments for division of emergency management employees and authorizing the legislative research commission to review whether there should be established an emergency managem'

In [169]:
text = text.replace(b'\n', b' ')
text[:500]


Out[169]:
b'GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10271-LRa-136A (04/02)        Short   Title: DEM/Emp. Retention Funds/LRC Study.  (Public)  Sponsors:  Representative Whitmire.  Referred to:      A BILL TO BE ENTITLED  AN ACT to appropriate funds to the department of public safety for retentionbased salary adjustments for division of emergency management employees and authorizing the legislative research commission to review whether there should be established an emergency managem'

Test Code


In [170]:
cleanRawText(extractRawText(hs))[:500]


Out[170]:
'GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10271-LRa-136A (04/02)        Short   Title: DEM/Emp. Retention Funds/LRC Study.  (Public)  Sponsors:  Representative Whitmire.  Referred to:      A BILL TO BE ENTITLED  AN ACT to appropriate funds to the department of public safety for retentionbased salary adjustments for division of emergency management employees and authorizing the legislative research commission to review whether there should be established an emergency managem'

In [171]:
cleanRawText(extractRawText(ss))[:500]


Out[171]:
"GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE DRS45321-LRf-105 (03/14)        Short   Title: WC/2015 Omnibus Law Changes.  (Public)  Sponsors:  Senator Lee (Primary Sponsor).  Referred to:      A BILL TO BE ENTITLED  AN ACT clarifying the authority and duties of industrial commission fraud investigators and making technical, conforming, and other changes to the workers' compensation laws of north carolina.  The General Assembly of North Carolina enacts:  SECTION 1.(a) Article 1 o"

(Select) Metadata Extraction

"Long Title" / Description


In [138]:
def extractLongTitle(body):
    return(cleanRawText(body.find('p', {'class':'aLongTitle'}).text))

In [139]:
extractLongTitle(hs)


Out[139]:
'AN ACT to appropriate funds to the department of public safety for retentionbased salary adjustments for division of emergency management employees and authorizing the legislative research commission to review whether there should be established an emergency management preparedness and response fee to support the work of the division.'

In [140]:
extractLongTitle(ss)


Out[140]:
"AN ACT clarifying the authority and duties of industrial commission fraud investigators and making technical, conforming, and other changes to the workers' compensation laws of north carolina."

Table Content


In [66]:
[tr.find('p') for tr in hs.find('table').find_all('tr') if tr.find('p')]


Out[66]:
[<p align="left" class="nonumber" style="margin-bottom:6.0pt;text-align:left">Short
   Title:        DEM/Emp. Retention Funds/LRC Study.</p>,
 <p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
   margin-bottom:3.0pt;margin-left:0in;text-align:left">Sponsors:</p>,
 <p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
   margin-bottom:3.0pt;margin-left:0in;text-align:left">Referred to:</p>]

In [67]:
[tr.find('p') for tr in ss.find('table').find_all('tr') if tr.find('p')]


Out[67]:
[<p align="left" class="nonumber" style="margin-bottom:6.0pt;text-align:left">Short
   Title:        WC/2015 Omnibus Law Changes.</p>,
 <p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
   margin-bottom:3.0pt;margin-left:0in;text-align:left">Sponsors:</p>,
 <p align="left" class="nonumber" style="margin-top:3.0pt;margin-right:0in;
   margin-bottom:3.0pt;margin-left:0in;text-align:left">Referred to:</p>]

In [87]:
for row in c:
    attr = cleanRawText(row.find('p').text)
    value = cleanRawText(row.find_all('td')[-1].text)
    if attr.lower().startswith(b'short'):
        d = attr.split(b': ')
        attr = d[0]
        value = d[1].strip() + b' ' + value
    attr = attr.strip().strip(b':')
    print('ATTR: {}; VALUE: {}'.format(attr, value))


ATTR: b'Short   Title'; VALUE: b'DEM/Emp. Retention Funds/LRC Study.  (Public) '
ATTR: b'Sponsors'; VALUE: b' Representative Whitmire. '
ATTR: b'Referred to'; VALUE: b'  '

In [149]:
def extractTableContent(body):
    info = {}
    contents = [tr for tr in body.find('table').find_all('tr') if tr.find('p')]
    for row in contents:
        attr = cleanRawText(row.find('p').text)
        value = cleanRawText(row.find_all('td')[-1].text)
        if attr.lower().startswith('short'):
            d = attr.split(': ')
            attr = d[0]
            value = d[1].strip() + ' ' + value
        attr = attr.strip().strip(':')
        attr = ''.join(attr.split())
        info[attr] = value
    return(info)

In [150]:
extractTableContent(hs)


Out[150]:
{'Referred to': '',
 'Short   Title': 'DEM/Emp. Retention Funds/LRC Study. (Public)',
 'Sponsors': 'Representative Whitmire.'}

In [151]:
extractTableContent(ss)


Out[151]:
{'Referred to': '',
 'Short   Title': 'WC/2015 Omnibus Law Changes. (Public)',
 'Sponsors': 'Senator Lee (Primary Sponsor).'}

Test Bed


In [92]:
soups = [bs(t, 'html.parser') for t in sample.text]

Text


In [141]:
rts = [extractRawText(soup) for soup in soups]

In [142]:
crts = [cleanRawText(rt) for rt in rts]

In [143]:
for crt in crts:
    print(crt[:100])
    print(crt[-100:])
    print('\n')


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10271-LRa-136A (04/02)        Short 
ions Committees on Justice and Public Safety.  SECTION 3. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE DRS45321-LRf-105 (03/14)        Short 
er is licensed under G.S.583325.G.S.583326."  SECTION 10. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE BILL DRH10476-MGf-142 (03/14)        Sh
 (5) Any illicit spirituous liquor.  (6) Mash."  SECTION 6. This act becomes effective July 1, 2016.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10335-LR-125 (03/27)        Short   
shall be calculated to the nearest cent (1)."  SECTION 2. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE DRS15195-ML-136 (03/10)        Short  
(e), 57D941(b), (d), and (f), and 57D942(b)."  SECTION 2. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH20066-ST-18A (01/29)        Short   
ecomes effective May 1, 2015, and applies to zoning ordinance changes adopted on or after that date.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH20143-LL-147 (03/24)        Short   
 act becomes effective July 1, 2015, and applies to eligible retirees who die on or after that date.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE DRS45308-ML-5C* (01/07)        Short  
t becomes effective December 1, 2015, and applies to any misconduct committed on or after that date.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH30090-LL-112 (3/4)        Short   Ti
, 2015, and applies to persons placed on probation or postrelease supervision on or after that date.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH40369-LR-138 (04/02)        Short   
3. This act becomes effective January 1, 2016, and applies to sick leave used on or after that date.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE BILL DRS15327-MLf-265A (04/26)        
 this act become effective July 1, 2016. The remainder of this act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE BILL DRS25292-TAz-9A* (03/15)        S
Human Services on or before December 1, 2016.  SECTION 3. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH40397-MGfqq-4A* (11/10)        Short
ately seek to become licensed under this act.  SECTION 5. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH40096-ST-4A (10/30)        Short   T
 among the permanent records of that office.  SECTION 15. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH20091-MC-99A* (03/09)        Short  
e used for purposes consistent with this act.  SECTION 3. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE BILL DRS15313-RW-18 (04/01)        Sho
equal or greater funding from the applicant."  SECTION 3. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE BILL DRH30495-MHa-162A (01/20)        S
4 of this act is effective July 1, 2016. The remainder of the 0act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE DRS45161-MC-136 (03/17)        Short  
ON 2. This act is effective for investments for taxable years beginning on or after January 1, 2015.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH20101-MK-102 (03/10)        Short   
ON 2. This act is effective when it becomes law and applies beginning with the 20152016 school year.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH40076-LMx-21 (01/14)        Short   
 corporate limits of the Town of Maggie Valley.  SECTION 3. This act becomes effective July 1, 2015.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH20196-LL-149 (03/25)        Short   
h year and January 2 of each subsequent year.  SECTION 2. This act is effective when it becomes law.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH10185-TD-13 (03/02)        Short   T
for all other nonliquid alternative fuels."  SECTION 3. This section becomes effective July 1, 2015.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH30049-MH-27 (01/20)        Short   T
ission appointments made on or after that date. The remainder of this act is effective July 1, 2015.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  S D  SENATE JOINT RESOLUTION DRSJR35301-LG-117 (06
esolution to the family of Harris Blake.  SECTION 4. This resolution is effective upon ratification.


GENERAL ASSEMBLY OF NORTH CAROLINA  SESSION 2015  H D  HOUSE DRH30316-MLf-229 (04/06)        Short  
rules pursuant to Section 8 of this act. The remainder of this act is effective when it becomes law.


Long Titles


In [144]:
lts = [extractLongTitle(soup) for soup in soups]

In [147]:
for lt in lts:
    print(lt[:70])
    print('\n')


AN ACT to appropriate funds to the department of public safety for ret


AN ACT clarifying the authority and duties of industrial commission fr


AN ACT ALLOWING PATIENTS WITH terminal or chronic ILLNESS TO LAWFULLY 


AN ACT to allow a local government to set the minimum wage within its 


AN ACT to make a technical and clarifying change to the limited liabil


AN ACT to amend the process by which the city councils receive citizen


AN ACT to increase the contributory death benefit payable on behalf of


AN ACT to (1) prohibit the use of discriminatory profiling by law enfo


AN ACT to amend provisions of the justice reinvestment act.


AN ACT to enact the eligible leave for employee caregiving time act.


AN ACT to include per transaction rates paid to license plate agency c


AN ACT to direct the division of child development and early education


AN ACT to establish licensure and education standards for the practice


AN ACT to provide FOR fouryear terms for members of the general assemb


AN ACT to enact the corner store initiative act to assist healthy food


AN ACT to CLARIFY that the freight rail and rail crossing safety impro


AN ACT to provide for a referendum to limit marine net fishing.


AN ACT to reenact the tax credit for qualified business investments.


AN ACT to allow local boards of education to set a school calendar tha


AN ACT to remove certain described property from the corporate limits 


AN ACT TO provide that STATE WILDLIFE LAWS do not apply TO OPOSSUMS BE


AN ACT to equalize the taxation of LIQUEFIED propane gas when used as 


AN ACT TO update the membership of the FIRE AND RESCUE COMMISSION to r


A JOINT RESOLUTION honoring the life and memory of harris blake, forme


AN ACT to authorize and regulate the sale of antique spirituous liquor


Info Tables


In [152]:
tabs = [extractTableContent(soup) for soup in soups]

In [154]:
for tab in tabs:
    print(tab['Sponsors'])


Representative Whitmire.
Senator Lee (Primary Sponsor).
Representative Alexander.
Representative L. Hall.
Senators Barringer and Lee (Primary Sponsors).
Representatives Stam, Goodman, Jackson, and Fraley   (Primary Sponsors).
Representative L. Bell.
Senators McKissick and Bryant (Primary Sponsors).
Representatives Faircloth, Daughtry, Boles, and Hurley   (Primary Sponsors).
Representatives R. Turner and Meyer (Primary   Sponsors).
Senator Tillman (Primary Sponsor).
Senator Pate (Primary Sponsor).
Representatives Collins and Fisher (Primary Sponsors).
Representatives Warren, Hardister, Malone, and Glazier   (Primary Sponsors).
Representatives Holley, Whitmire, B. Brown, and   Lambeth (Primary Sponsors).
Senators J. Davis and Rabon (Primary Sponsors).
Representative W. Richardson.
Senator Lowe (Primary Sponsor).
Representative Holloway.
Representative Presnell.
Representatives West, Hager, McElraft, and Lucas   (Primary Sponsors).
Representative Collins.
Representatives Ross, Saine, Boles, and J. Bell   (Primary Sponsors).
Senator Tillman (Primary Sponsor).
Representatives Hager and J. Bell (Primary Sponsors).

In [156]:
help(bill_texts_filed.apply)


Help on method apply in module pandas.core.frame:

apply(func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds) method of pandas.core.frame.DataFrame instance
    Applies function along input axis of DataFrame.
    
    Objects passed to functions are Series objects having index
    either the DataFrame's index (axis=0) or the columns (axis=1).
    Return type depends on whether passed function aggregates, or the
    reduce argument if the DataFrame is empty.
    
    Parameters
    ----------
    func : function
        Function to apply to each column/row
    axis : {0 or 'index', 1 or 'columns'}, default 0
        * 0 or 'index': apply function to each column
        * 1 or 'columns': apply function to each row
    broadcast : boolean, default False
        For aggregation functions, return object of same size with values
        propagated
    raw : boolean, default False
        If False, convert each row or column into a Series. If raw=True the
        passed function will receive ndarray objects instead. If you are
        just applying a NumPy reduction function this will achieve much
        better performance
    reduce : boolean or None, default None
        Try to apply reduction procedures. If the DataFrame is empty,
        apply will use reduce to determine whether the result should be a
        Series or a DataFrame. If reduce is None (the default), apply's
        return value will be guessed by calling func an empty Series (note:
        while guessing, exceptions raised by func will be ignored). If
        reduce is True a Series will always be returned, and if False a
        DataFrame will always be returned.
    args : tuple
        Positional arguments to pass to function in addition to the
        array/series
    Additional keyword arguments will be passed as keywords to the function
    
    Notes
    -----
    In the current implementation apply calls func twice on the
    first column/row to decide whether it can take a fast or slow
    code path. This can lead to unexpected behavior if func has
    side-effects, as they will take effect twice for the first
    column/row.
    
    Examples
    --------
    >>> df.apply(numpy.sqrt) # returns DataFrame
    >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0)
    >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1)
    
    See also
    --------
    DataFrame.applymap: For elementwise operations
    
    Returns
    -------
    applied : Series or DataFrame

Process


In [158]:
bill_texts_filed['soup'] = bill_texts_filed.apply(lambda x: bs(x.text, 'html.parser'), axis=1)

In [159]:
bill_texts_filed.head()


Out[159]:
bill house session text soup
0 1 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse...
1 2 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse...
2 3 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse...
3 4 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse...
4 5 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse...

In [179]:
bill_texts_filed['content'] = bill_texts_filed.apply(lambda x: extractRawText(x['soup'].body), axis=1)

In [180]:
bill_texts_filed['long_title'] = bill_texts_filed.apply(lambda x: extractLongTitle(x['soup'].body), axis=1)

In [181]:
bill_texts_filed['table_info'] = bill_texts_filed.apply(lambda x: extractTableContent(x['soup'].body), axis=1)

In [182]:
bill_texts_filed.head()


Out[182]:
bill house session text soup content long_title table_info
0 1 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse... GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... A HOUSE RESOLUTION adopting the permanent rule... {'Referred to': '', 'Sponsors': 'Representativ...
1 2 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse... GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... A JOINT RESOLUTIOn providing for adjournment s... {'Referred to': '', 'Sponsors': 'Representativ...
2 3 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse... GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... AN ACT to provide further REGULATORY RELIEF TO... {'Referred to': '', 'Short Title': 'Regulato...
3 4 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse... GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... AN ACT directing the department of transportat... {'Referred to': '', 'Short Title': 'Terminat...
4 5 H 2015E4 b'<html>\r\n\r\n<head>\r\n<meta http-equiv=Con... <html> <head> <meta content="text/html; charse... GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... AN ACT to clarify the service area for communi... {'Referred to': '', 'Short Title': 'Municipa...

In [183]:
bill_texts_filed_content = bill_texts_filed[['session', 'house', 'bill', 'content', 'long_title', 'table_info']]

In [184]:
bill_texts_filed_content.head()


Out[184]:
session house bill content long_title table_info
0 2015E4 H 1 GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... A HOUSE RESOLUTION adopting the permanent rule... {'Referred to': '', 'Sponsors': 'Representativ...
1 2015E4 H 2 GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... A JOINT RESOLUTIOn providing for adjournment s... {'Referred to': '', 'Sponsors': 'Representativ...
2 2015E4 H 3 GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... AN ACT to provide further REGULATORY RELIEF TO... {'Referred to': '', 'Short Title': 'Regulato...
3 2015E4 H 4 GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... AN ACT directing the department of transportat... {'Referred to': '', 'Short Title': 'Terminat...
4 2015E4 H 5 GENERAL ASSEMBLY OF NORTH CAROLINA FOURTH EXT... AN ACT to clarify the service area for communi... {'Referred to': '', 'Short Title': 'Municipa...

In [185]:
with open('data/bill_texts_filed_content.pkl', 'wb') as f1:
    pickle.dump(bill_texts_filed_content, f1)

In [ ]: